#packages
library(ggplot2)
library(dplyr)
library(OrgMassSpecR)
library(pmartR)
library(tidyverse)
#utils
here::i_am("Code/protein_sequence_simulation.R")

source(here::here("Code", "ptm_utils.R"))

Protein Simulation using Real Data

We can use real data to generate probabilities for selecting each amino acid

lip_data = readr::read_tsv(here::here("Data", "double_pept_lytic_sites.tsv"))

all_peptides = unlist(lapply(lip_data$Sequence, \(x) strsplit(x, split = '; ')[[1]]))

unique_peptides = unique(all_peptides)

# Lets look at the frequency of each amino acid in the real data
amino_acids = unlist(lapply(unique_peptides, \(x) strsplit(x, split = "")[[1]]))
barplot(table(amino_acids), xlab = "Amino Acids", ylab = "Frequency", main = "Amino Acid Frequency Plot")

Then using those probabilities, sample a synthetic protein with similar structural properties

amino_acid_distribution = create_amino_acid_distribution(amino_acids)
synthetic_protein = generate_protein(1000, amino_acid_distribution)
synthetic_amino_acids = unlist(lapply(synthetic_protein, \(x) strsplit(x, split = "")[[1]]))

barplot(table(synthetic_amino_acids), xlab = "(Synthetic) Amino Acids", ylab = "Frequency", main = "(Synthetic) Amino Acid Frequency Plot\nFrom 1000 sampled amino acids")

Sequence Masking

We can mask regions of a sequence to simulate how proteins are initially digested in LiP. This essentially will introduce an artificial “folding” in the protein. We do this by moving through each amino acid from the start of the sequence, and flag sequential amino acids as a “masked region” until a stopping criteria is reached. This is repeated until the end of the sequence is reached. Here, our stopping criteria is simple: either a specified number of trypsin digestion sites are contained in the region, or a maximum region size are reached. We also add a delay to the end of a masked region so that it doesn’t always end on a trypsin site, and a gap between masked regions which will be left unmasked. We do this randomly by introducing “gap” and “delay” distributions, which can be any function returning a non-negative integer. Here we use simple Poisson distributions.

get_mask_indices = function(
  sequence,
  start, 
  delay_fn, 
  max_masked_region_size, 
  min_num_trypsin_sites
){
  if(start > length(amino_acids)) stop("Start position is larger than sequence length")
  adjusted_start = start + delay_fn()
  curr_ind = adjusted_start
  amino_acids = unlist(lapply(sequence, \(x) strsplit(x, split = "")[[1]]))
  n_trypsin_sites = 0
  region_size = 0
  
  # Check if the adjusted start is valid
  if(adjusted_start > length(amino_acids)){
    return(list(start = start, stop = Inf)) #stop at inf to flag finished procedure (this one will be tossed out)
  }
  
  #check if one of the conditions can be satisfied
  remaining_amino_acids = amino_acids[adjusted_start:length(amino_acids)]
  remaining_spit_sites = sum(grepl("R", remaining_amino_acids) | grepl("K", remaining_amino_acids))
  if(length(remaining_amino_acids) < max_masked_region_size & remaining_spit_sites < min_num_trypsin_sites){
    return(list(start = adjusted_start, stop = Inf)) #stop at inf to flag finished procedure (this one will be tossed out)
  }
  

  while(n_trypsin_sites < min_num_trypsin_sites & region_size < max_masked_region_size){
    curr_amino_acid = amino_acids[curr_ind]
    if(curr_amino_acid == "K" | curr_amino_acid == "R"){
      n_trypsin_sites = n_trypsin_sites + 1
    }
    region_size = region_size + 1
    curr_ind = curr_ind + 1
  }
  end_ind = curr_ind + delay_fn()
  return(list(start = adjusted_start, stop = end_ind))
}

mask_sequence = function(
    sequence,
    delay_fn, 
    gap_fn,
    max_masked_region_size, 
    min_num_trypsin_sites
  ){
  start_vec = c()
  stop_vec = c()
  curr_start = 1 + gap_fn()
  last_stop = 0
  masked_sequence = sequence
  while(last_stop < nchar(sequence)){
    mask_indices = get_mask_indices(
      sequence =sequence,
      start = curr_start,
      delay_fn = delay_fn,
      max_masked_region_size = max_masked_region_size,
      min_num_trypsin_sites = min_num_trypsin_sites
    )
    last_stop = mask_indices$stop
    curr_start = mask_indices$stop + gap_fn()
    if(!(last_stop > nchar(sequence) | curr_start > nchar(sequence))){
      start_vec = c(start_vec, mask_indices$start)
      stop_vec = c(stop_vec, mask_indices$stop)
      masked_sequence = replace_between_indices(masked_sequence, mask_indices$start, mask_indices$stop, '#')
    }
  }
  return(list(sequence = sequence, masked_sequence = masked_sequence, start_vec = start_vec, stop_vec = stop_vec))
}

replace_between_indices = function(x, start, stop, replacement) {
  before = substr(x, 1, start - 1)
  after = substr(x, stop + 1, nchar(x))
  replacement_string = strrep(replacement, stop - start + 1)
  result = paste0(before, replacement_string, after)
  return(result)
}
sample_gap = function() rpois(1, 10)
sample_delay = function() rpois(1, 2)

Gap Distribution

ggplot(data.frame(values = replicate(1000, sample_gap())), aes(x = values)) + 
  geom_histogram(binwidth = 1) + 
  xlab("Gap") + 
  theme_bw()

Delay Distribution

ggplot(data.frame(values = replicate(1000, sample_delay())), aes(x = values)) + 
  geom_histogram(binwidth = 1) + 
  xlab("Delay") +
  theme_bw()

Masking Example

Here we generate a synthetic protein of length 200, and mask it 50 different time. We require at least 2 trypsin digestion sites in the sequence with a maximum masked region size of 50

synthetic_protein = generate_protein(200, amino_acid_distribution)
max_masked_region_size = 50
min_num_trypsin_sites = 2


n_reps = 50
out_list = vector('list', n_reps)
for(i in seq_len(n_reps)){
  out_list[[i]] = mask_sequence(
    sequence = synthetic_protein,
    delay_fn = sample_delay,
    gap_fn = sample_gap,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )
}
create_sequence_plot = function(list_of_sequences) {
  
  # Get the names of the sequences, or use row numbers if names are not provided
  sequence_names = if (!is.null(names(list_of_sequences))) names(list_of_sequences) else seq_along(list_of_sequences)
  
  plot_data = lapply(seq_along(list_of_sequences), function(i) {
    seq_list = list_of_sequences[[i]]
    sequence = seq_list$sequence
    masked_sequence = seq_list$masked_sequence
    
    data.frame(
      row = i,
      label = sequence_names[i],
      position = 1:nchar(sequence),
      letter = strsplit(sequence, "")[[1]],
      color = ifelse(strsplit(masked_sequence, "")[[1]] == "#", "red", "black")
    )
  }) %>% bind_rows()
  
  ggplot(plot_data, aes(x = position, y = -row, label = letter, color = color)) +
    geom_text(size = 4) +
    geom_text(aes(x = -2, y = -row, label = label), hjust = 1, size = 4, color = "black") +
    scale_color_identity() +
    scale_y_continuous(breaks = -seq_along(list_of_sequences), labels = NULL) +
    expand_limits(x = c(-10, NA)) +
    theme_void() +
    theme(legend.position = "none") +
    coord_cartesian(clip = "off")
}

Below, we display the synthetic sequence along with the masked regions (in red)

create_sequence_plot(out_list)

Inducing Group Effect

There are two obvious ways group effects could be induced, we go over both here.

Downsampling from a “complete” masked sequence

Here, we can simulate a “complete” masked sequence, which has all possible masked regions. Groups are made distinct by downsampling from the complete masked sequence in different ways.

max_masked_region_size = 50
min_num_trypsin_sites = 2


complete_masked_sequence = mask_sequence(
    sequence = synthetic_protein,
    delay_fn = sample_delay,
    gap_fn = sample_gap,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )

We just do naive downsampling here, that is, a fixed proportion of masked regions are selected from the complete set for each group

downsample_masking = function(
    regions_start, 
    downsample_fn, 
    ...
    ){
  downsample_fn(regions_start, ...)
}

downsample_fn = function(vec, prop) sample(vec, floor(prop * length(vec)))
remask_sequence = function(sequence, start_vec, stop_vec){
  masked_sequence = sequence
  for(i in seq_along(start_vec)){
    masked_sequence = replace_between_indices(masked_sequence, start_vec[i], stop_vec[i], '#')
  }
  return(masked_sequence)
}

g1_indices = downsample_masking(seq_along(complete_masked_sequence$start_vec), downsample_fn, 0.5)
g1_masked_sequence = complete_masked_sequence
g1_masked_sequence$start_vec = g1_masked_sequence$start_vec[g1_indices]
g1_masked_sequence$stop_vec = g1_masked_sequence$stop_vec[g1_indices]
g1_masked_sequence$masked_sequence = remask_sequence(g1_masked_sequence$sequence, g1_masked_sequence$start_vec, g1_masked_sequence$stop_vec)

g2_indices = downsample_masking(seq_along(complete_masked_sequence$start_vec), downsample_fn, 0.5)
g2_masked_sequence = complete_masked_sequence
g2_masked_sequence$start_vec = g2_masked_sequence$start_vec[g2_indices]
g2_masked_sequence$stop_vec = g2_masked_sequence$stop_vec[g2_indices]
g2_masked_sequence$masked_sequence = remask_sequence(g2_masked_sequence$sequence, g2_masked_sequence$start_vec, g2_masked_sequence$stop_vec)
create_sequence_plot(
  list(
    "Complete Masking" = complete_masked_sequence, 
    "Group 1 Masking" = g1_masked_sequence, 
    "Group 2 Masking" = g2_masked_sequence
    )
  )

Adjusting the gap/delay distributions

We can also be a bit more sophisticated and alter the masking generating mechanism itself. This can be done by tweaking 4 parameters:
1. Delay distribution
2. Gap distribution
3. Maximum masked region size
4. Minimum number of trypsin sites per masked region

sample_gap_g1 = function() rpois(1, 5)
sample_delay_g1 = function() rpois(1, 4)
max_masked_region_size = 100
min_num_trypsin_sites = 2

n_reps = 500

masked_mat = matrix('', nrow = n_reps, ncol = nchar(synthetic_protein))
for(i in seq_len(n_reps)){
  masked_sequence = mask_sequence(
    sequence = synthetic_protein,
    delay_fn = sample_delay_g1,
    gap_fn = sample_gap_g1,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )$masked_sequence
  
  masked_mat[i,] = strsplit(masked_sequence, split = '')[[1]]
}

ggdat_g1 = data.frame(`Location` = seq_len(nchar(synthetic_protein)), Count = colSums(masked_mat == '#'), Group = "1")
sample_gap_g2 = function() rpois(1, 20)
sample_delay_g2 = function() rpois(1, 4)
max_masked_region_size = 100
min_num_trypsin_sites = 3

n_reps = 500

masked_mat = matrix('', nrow = n_reps, ncol = nchar(synthetic_protein))
for(i in seq_len(n_reps)){
  masked_sequence = mask_sequence(
    sequence = synthetic_protein,
    delay_fn = sample_delay_g2,
    gap_fn = sample_gap_g2,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )$masked_sequence
  
  masked_mat[i,] = strsplit(masked_sequence, split = '')[[1]]
}

ggdat_g2 = data.frame(`Location` = seq_len(nchar(synthetic_protein)), Count = colSums(masked_mat == '#'), Group = "2")

We simulate 500 sets of masked regions for the synthetic protein, and can visualize the different distribution of masked regions across groups

ggplot(rbind(ggdat_g1, ggdat_g2), aes(x = Location, y = Count, color = as.factor(Group))) + 
  geom_line() + 
  guides(color = guide_legend(title = "Group")) + 
  xlab("Amino Acid Index") + 
  theme_bw() + 
  theme(legend.position = 'bottom') 

LiP Stage 1 Digestion

Prototype Proteinase K synthetic digestion

simulate_proteinase_k_cleavage = function(sequence) {
  # Define amino acids considered for cleavage (aliphatic, aromatic, hydrophobic)
  cleavage_aa = c("A", "V", "L", "I", "F", "Y", "W", "M", "P")  # Aliphatic, aromatic, and hydrophobic residues

  # Convert the sequence to a character vector
  sequence_vector = unlist(strsplit(sequence, ""))
  
  # Initialize vectors to store peptide information
  peptides = c()
  start_indices = c()
  end_indices = c()
  
  # Initialize start index
  start = 1
  
  # Loop through the sequence to find potential cleavage sites
  for (i in 1:(length(sequence_vector) - 1)) {
    if (sequence_vector[i] %in% cleavage_aa) {
      peptides = c(peptides, paste(sequence_vector[start:i], collapse = ""))
      start_indices = c(start_indices, start)
      end_indices = c(end_indices, i)
      start = i + 1
    }
  }
  
  # Add the final peptide after the last cleavage site
  if (start <= length(sequence_vector)) {
    peptides = c(peptides, paste(sequence_vector[start:length(sequence_vector)], collapse = ""))
    start_indices = c(start_indices, start)
    end_indices = c(end_indices, length(sequence_vector))
  }
  
  # Create a dataframe
  cleavage_df = data.frame(
    peptide = peptides,
    start = start_indices,
    stop = end_indices,
    stringsAsFactors = FALSE
  )
  
  return(cleavage_df)
}

set.seed(1)
synthetic_protein2 = generate_protein(100, amino_acid_distribution)
result_df <- simulate_proteinase_k_cleavage(synthetic_protein2)
print(result_df)
##      peptide start stop
## 1         EV     1    2
## 2         SF     3    4
## 3          L     5    5
## 4         NY     6    7
## 5          I     8    8
## 6          I     9    9
## 7          A    10   10
## 8          L    11   11
## 9          L    12   12
## 10        TV    13   14
## 11      QDTW    15   18
## 12         V    19   19
## 13        QF    20   21
## 14        EI    22   23
## 15         L    24   24
## 16        EV    25   26
## 17         A    27   27
## 18         V    28   28
## 19    KGRSRL    29   34
## 20         P    35   35
## 21         I    36   36
## 22         P    37   37
## 23         L    38   38
## 24        TV    39   40
## 25         P    41   41
## 26         I    42   42
## 27         P    43   43
## 28       DDP    44   46
## 29         A    47   47
## 30  RQTRKREA    48   55
## 31         A    56   56
## 32       GDI    57   59
## 33         V    60   60
## 34         F    61   61
## 35      GRGI    62   65
## 36      ERQA    66   69
## 37 NGKGGRNKV    70   78
## 38        QY    79   80
## 39       RTV    81   83
## 40       GQL    84   86
## 41        TL    87   88
## 42        EL    89   90
## 43        EA    91   92
## 44         I    93   93
## 45       NQP    94   96
## 46        RV    97   98
## 47         P    99   99
## 48         S   100  100
print(synthetic_protein2)
## [1] "EVSFLNYIIALLTVQDTWVQFEILEVAVKGRSRLPIPLTVPIPDDPARQTRKREAAGDIVFGRGIERQANGKGGRNKVQYRTVGQLTLELEAINQPRVPS"

Compared to Expasy Peptide Cutter API:

api_cleavage_sites = c(1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 34, 36, 38, 39, 40, 42, 47, 50, 54, 55, 56, 59, 60, 61, 65, 66, 69, 78, 80, 82, 83, 86, 87, 88, 89, 90, 91, 92, 93, 98)
ggdat = data.frame(
  pos = c(result_df$start, api_cleavage_sites), 
  group = rep(c("here", "api"), times = c(length(result_df$start), length(api_cleavage_sites))),
  y = rep(c(1, -1), times = c(length(result_df$start), length(api_cleavage_sites))))

ggplot(ggdat, aes(y = y, x = pos, color = group)) + 
  geom_point() + 
  ylim(-10, 10) + 
  xlab("Amino Acid Position") + 
  ylab(NULL) + 
  theme_bw()

Cleaving masked sequences

Now we can cleave our masked sequences for each group.

max_masked_region_size = 50
min_num_trypsin_sites = 2


complete_masked_sequence = mask_sequence(
    sequence = synthetic_protein,
    delay_fn = sample_delay,
    gap_fn = sample_gap,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )

g1_indices = downsample_masking(seq_along(complete_masked_sequence$start_vec), downsample_fn, 0.5)
g1_masked_sequence = complete_masked_sequence
g1_masked_sequence$start_vec = g1_masked_sequence$start_vec[g1_indices]
g1_masked_sequence$stop_vec = g1_masked_sequence$stop_vec[g1_indices]
g1_masked_sequence$masked_sequence = remask_sequence(g1_masked_sequence$sequence, g1_masked_sequence$start_vec, g1_masked_sequence$stop_vec)

g2_indices = downsample_masking(seq_along(complete_masked_sequence$start_vec), downsample_fn, 0.5)
g2_masked_sequence = complete_masked_sequence
g2_masked_sequence$start_vec = g2_masked_sequence$start_vec[g2_indices]
g2_masked_sequence$stop_vec = g2_masked_sequence$stop_vec[g2_indices]
g2_masked_sequence$masked_sequence = remask_sequence(g2_masked_sequence$sequence, g2_masked_sequence$start_vec, g2_masked_sequence$stop_vec)
perfect_digest_g1 = simulate_proteinase_k_cleavage(g1_masked_sequence$masked_sequence)
perfect_digest_g2 = simulate_proteinase_k_cleavage(g2_masked_sequence$masked_sequence)
perfect_digest_complete = simulate_proteinase_k_cleavage(complete_masked_sequence$masked_sequence)

insert_char_at_indices = function(string, indices, char = '|') {
  indices = sort(indices)
  indices = indices[indices >= 1 & indices <= nchar(string)]
  
  chars = unlist(strsplit(string, ""))
  
  offset = 0
  for (index in indices) {
    chars = append(chars, char, after = index + offset - 1)
    offset = offset + 1
  }
  
  new_string = paste(chars, collapse = "")
  
  return(new_string)
}

add_cleavage_characters = function(masked_sequence_dat, digest_results, char = '|'){
  masked_sequence_dat$masked_sequence = insert_char_at_indices(masked_sequence_dat$masked_sequence, digest_results$start, char = char)
  masked_sequence_dat$sequence = insert_char_at_indices(masked_sequence_dat$sequence, digest_results$start, char = char)
  
  return(masked_sequence_dat)
}

complete_masked_sequence_perfect_digest = add_cleavage_characters(complete_masked_sequence, perfect_digest_complete)
g1_masked_sequence_perfect_digest = add_cleavage_characters(g1_masked_sequence, perfect_digest_g1)
g2_masked_sequence_perfect_digest = add_cleavage_characters(g2_masked_sequence, perfect_digest_g2)

Plot all potential cleavage sites. ‘|’ denotes a proteinase K cleavage location

create_sequence_plot_with_cleavage = function(list_of_sequences) {
  
  # Get the names of the sequences, or use row numbers if names are not provided
  sequence_names = if (!is.null(names(list_of_sequences))) names(list_of_sequences) else seq_along(list_of_sequences)
  
  plot_data = lapply(seq_along(list_of_sequences), function(i) {
    seq_list = list_of_sequences[[i]]
    sequence = seq_list$sequence
    masked_sequence = seq_list$masked_sequence
    
    data.frame(
      row = i,
      label = sequence_names[i],
      position = 1:nchar(masked_sequence),
      letter = strsplit(sequence, "")[[1]],
      color = ifelse(strsplit(masked_sequence, "")[[1]] == "|", "purple3", 
                     ifelse(strsplit(masked_sequence, "")[[1]] == "#", "red", "black"))
    )
  }) %>% bind_rows()
  
  ggplot(plot_data, aes(x = position, y = -row, label = letter, color = color)) +
    geom_text(size = 4) +
    geom_text(aes(x = -2, y = -row, label = label), hjust = 1, size = 4, color = "black") +
    scale_color_identity() +
    scale_y_continuous(breaks = -seq_along(list_of_sequences), labels = NULL) +
    expand_limits(x = c(-10, NA)) +
    theme_void() +
    theme(legend.position = "none") +
    coord_cartesian(clip = "off")
}
create_sequence_plot_with_cleavage(
  list(
    "Complete Masking" = complete_masked_sequence_perfect_digest, 
    "Group 1 Masking" = g1_masked_sequence_perfect_digest, 
    "Group 2 Masking" = g2_masked_sequence_perfect_digest
    )
  )

As before we can potentially down-sample the cleavage sites to simulate an imperfect digestion

imperfect_digest = function(
    ptm_site_mapping, 
    prop_to_miss#, 
    #peptide_coef
    ){
  n_peps = nrow(ptm_site_mapping)
  
  # coef_dat = data.frame(peptide = names(peptide_coef), Mult_Factor = exp(peptide_coef))
  # ptm_site_mapping = ptm_site_mapping %>% left_join(., coef_dat, by = "peptide")
  
  # TODO does the probability of a successful split depend on the peptide length?
  ptm_site_mapping = ptm_site_mapping %>% mutate(merge_with_next = (seq_len(n_peps) %in% sample(seq_len(n_peps)[-n_peps], round(prop_to_miss * (n_peps - 1)))))
  protein = ptm_site_mapping$protein %>% unique()
  
  remove_vec = rep(FALSE, nrow(ptm_site_mapping))
  ptm_site_mapping = ptm_site_mapping %>% as_tibble()
  
  for(i in seq_len(nrow(ptm_site_mapping) - 1)){
    if(ptm_site_mapping$merge_with_next[i]){
      peptide = paste0(ptm_site_mapping$peptide[i], ptm_site_mapping$peptide[i + 1])
      # mult_factor = ptm_site_mapping$Mult_Factor[i] * ptm_site_mapping$Mult_Factor[i + 1]
      start = ptm_site_mapping$start[i]
      stop = ptm_site_mapping$stop[i + 1]
      remove_vec[i] = TRUE
      merge_with_next = ptm_site_mapping$merge_with_next[i+1]
      # site =  paste(ptm_site_mapping$site[i], ptm_site_mapping$site[i + 1], sep = ';')
      
      
      ptm_site_mapping$peptide[i + 1] = peptide
      # ptm_site_mapping$Mult_Factor[i + 1] = mult_factor
      ptm_site_mapping$start[i + 1] = start
      ptm_site_mapping$stop[i + 1] = stop
      ptm_site_mapping$merge_with_next[i + 1] = merge_with_next
      # ptm_site_mapping$site[i + 1] = site
    }
  }
  return(ptm_site_mapping[!remove_vec,])
}

imperfect_digest_g1 = imperfect_digest(perfect_digest_g1, 0.5)
imperfect_digest_g2 = imperfect_digest(perfect_digest_g2, 0.5)

g1_masked_sequence_imperfect_digest = add_cleavage_characters(g1_masked_sequence, imperfect_digest_g1)
g2_masked_sequence_imperfect_digest = add_cleavage_characters(g2_masked_sequence, imperfect_digest_g2)

Plot imperfect cleavage sites

create_sequence_plot_with_cleavage(
  list(
    "Complete Masking" = complete_masked_sequence_perfect_digest, 
    "Group 1 Masking" = g1_masked_sequence_imperfect_digest, 
    "Group 2 Masking" = g2_masked_sequence_imperfect_digest
    )
  )

“Denature” Protein and perform Trypsin Cleavage

Now we unmask the proteins, and perform (imperfect) trypsin digestion

simulate_trypsin_cleavage = function(sequence) {
  cleavage_aa = c("K", "R")
  skip_aa = "P"
  
  cleavage_rule = function(prev_aa, curr_aa) {
    return(curr_aa %in% cleavage_aa && prev_aa != skip_aa)
  }

  sequence_vector = unlist(strsplit(sequence, ""))

  peptides = c()
  start_indices = c()
  end_indices = c()

  start = 1

  for (i in 1:(length(sequence_vector) - 1)) {
    if (cleavage_rule(sequence_vector[i], sequence_vector[i + 1])) {
      peptides = c(peptides, paste(sequence_vector[start:i], collapse = ""))
      start_indices = c(start_indices, start)
      end_indices = c(end_indices, i)
      start = i + 1
    }
  }

  if (start <= length(sequence_vector)) {
    peptides = c(peptides, paste(sequence_vector[start:length(sequence_vector)], collapse = ""))
    start_indices = c(start_indices, start)
    end_indices = c(end_indices, length(sequence_vector))
  }

  cleavage_df = data.frame(
    peptide = peptides,
    start = start_indices,
    stop = end_indices,
    stringsAsFactors = FALSE
  )

  return(cleavage_df)
}
g1_stage2_digest = simulate_trypsin_cleavage(g1_masked_sequence_imperfect_digest$sequence)
g2_stage2_digest = simulate_trypsin_cleavage(g2_masked_sequence_imperfect_digest$sequence)

g1_imperfect_digest = imperfect_digest(g1_stage2_digest, 0.5)
g2_imperfect_digest = imperfect_digest(g2_stage2_digest, 0.5)

g1_final_sequence = add_cleavage_characters(g1_masked_sequence_imperfect_digest, g1_stage2_digest, "&")
g2_final_sequence = add_cleavage_characters(g2_masked_sequence_imperfect_digest, g2_stage2_digest, "&")
create_sequence_plot_with_cleavage = function(list_of_sequences) {
  
  # Get the names of the sequences, or use row numbers if names are not provided
  sequence_names = if (!is.null(names(list_of_sequences))) names(list_of_sequences) else seq_along(list_of_sequences)
  
  plot_data = lapply(seq_along(list_of_sequences), function(i) {
    seq_list = list_of_sequences[[i]]
    sequence = seq_list$sequence
    masked_sequence = seq_list$masked_sequence
    
    data.frame(
      row = i,
      label = sequence_names[i],
      position = 1:nchar(masked_sequence),
      letter = strsplit(sequence, "")[[1]],
      color = ifelse(strsplit(masked_sequence, "")[[1]] == "&", "orange4", 
                     ifelse(strsplit(masked_sequence, "")[[1]] == "|", "purple3", 
                            ifelse(strsplit(masked_sequence, "")[[1]] == "#", "red", "black")))
    )
  }) %>% bind_rows()
  
  ggplot(plot_data, aes(x = position, y = -row, label = letter, color = color)) +
    geom_text(size = 4) +
    geom_text(aes(x = -2, y = -row, label = label), hjust = 1, size = 4, color = "black") +
    scale_color_identity() +
    scale_y_continuous(breaks = -seq_along(list_of_sequences), labels = NULL) +
    expand_limits(x = c(-10, NA)) +
    theme_void() +
    theme(legend.position = "none") +
    coord_cartesian(clip = "off")
}

… and we can visualize the fully cleaved sequences below. Again ‘|’ denotes a proteinase K cleavage site, while ‘&’ denotes a trypsin cleavage site.

create_sequence_plot_with_cleavage(
  list(
    # "Complete Masking" = complete_masked_sequence_perfect_digest, 
    "Group 1 Masking" = g1_final_sequence, 
    "Group 2 Masking" = g2_final_sequence
    )
  )

Simulating some data

Okay! Now lets actually use all of this to simulate some data we can make use of.

set.seed(3409)
synthetic_proteins = list(
  generate_protein(500, amino_acid_distribution)#,
  # generate_protein(1000, amino_acid_distribution)
  )
max_masked_region_size = 50
min_num_trypsin_sites = 2


mask_all_proteins = function(
    sequences,
    delay_fn,
    gap_fn,
    max_masked_region_size,
    min_num_trypsin_sites){
  out_list = vector('list', length = length(sequences))
  for(i in seq_along(sequences)){
    out_list[[i]] = mask_sequence(
      sequence = sequences[[i]],
      delay_fn = sample_delay,
      gap_fn = sample_gap,
      max_masked_region_size = max_masked_region_size,
      min_num_trypsin_sites = min_num_trypsin_sites
    )
  }
  return(out_list)
}

complete_mask_out = mask_all_proteins(
    sequences = synthetic_proteins,
    delay_fn = sample_delay,
    gap_fn = sample_gap,
    max_masked_region_size = max_masked_region_size,
    min_num_trypsin_sites = min_num_trypsin_sites
  )
downsample_masking = function(
    masking_output,
    downsample_fn, 
    ...
    ){
  indices = downsample_fn(seq_along(masking_output$start_vec), ...)
  
  masking_output$start_vec = masking_output$start_vec[indices]
  masking_output$stop_vec = masking_output$stop_vec[indices]
  masking_output$masked_sequence = remask_sequence(
    masking_output$sequence, 
    masking_output$start_vec, 
    masking_output$stop_vec
    )
  return(masking_output)
}

downsample_maskings = function(
    masking_list,
    downsample_fn,
    ...){
  out_list = vector('list', length = length(masking_list))
  for(i in seq_along(masking_list)){
    out_list[[i]] = downsample_masking(
      masking_output = masking_list[[i]],
      downsample_fn = downsample_fn,
      ...
    )
  }
  return(out_list)
}

g1_mask_out = downsample_maskings(complete_mask_out, downsample_fn, prop = 0.5)
g2_mask_out = downsample_maskings(complete_mask_out, downsample_fn, prop = 0.5)

get_masked_sequences = function(masking_list){
  lapply(masking_list, \(x) x$masked_sequence)
}
get_sequences = function(masking_list){
  lapply(masking_list, \(x) x$sequence)
}

Proteinase K digestion

digest_proteins_PK = function(sequences){
  lapply(sequences, \(x) simulate_proteinase_k_cleavage(x))
}

Trypsin digestion

digest_proteins_trypsin = function(sequences){
  lapply(sequences, \(x) simulate_trypsin_cleavage(x))
}
get_cleavage_site_mapping = function(mask_out){
  digest_results_PK = digest_proteins_PK(get_masked_sequences(mask_out))
  digest_results_tryp = digest_proteins_trypsin(get_sequences(mask_out))
  
  proteinaseK_cleavage_sites = lapply(digest_results_PK, \(x) x$start[-1])
  trypsin_cleavage_sites = lapply(digest_results_tryp, \(x) x$start[-1])
  
  PK_dat = lapply(proteinaseK_cleavage_sites, \(x) data.frame(Site = x, PK = 1))
  TR_dat = lapply(trypsin_cleavage_sites, \(x) data.frame(Site = x, TR = -1))
  full_dat = lapply(
    seq_along(PK_dat), 
    \(x) full_join(PK_dat[[x]], TR_dat[[x]], by = "Site") %>% 
      tidyr::replace_na(., list(PK = 0, TR = 0)) %>%
      mutate(
        sum_col = PK + TR,
        Enzyme = case_match(
        sum_col,
        -1 ~ "Trypsin",
        0 ~ "Both",
        1 ~ "Proteinase K"
      )) %>%
      select(Site, Enzyme) %>%
      arrange(Site)
    )
  
  return(full_dat)
}

g1_cleavage_site_mapping = get_cleavage_site_mapping(g1_mask_out)
g2_cleavage_site_mapping = get_cleavage_site_mapping(g2_mask_out)
#Naive way to introduce imperfect digestion into the cleavage mapping
  #TODO: In reality we want this to happen potentially before creating the mapping for trypsin and PK separately, but that code will need to speed up for that to happen. This is at least good to get some semblence of an end to end pipeline going
get_imperfect_cleavage_site_mapping = function(cleavage_site_mapping, prop_miss){
  for(i in seq_along(cleavage_site_mapping)){
    n_site = nrow(cleavage_site_mapping[[i]])
    w = 1 / diff(c(1, cleavage_site_mapping[[i]]$Site))
    probs = w / sum(w)
    remove_ids = sample(seq_len(n_site), floor(prop_miss * n_site), prob = w)
    cleavage_site_mapping[[i]] = cleavage_site_mapping[[i]][-remove_ids,]
  }
  return(cleavage_site_mapping)
}
#get abundances for each protein
#map those abundances to peptides
#map those peptides to subjects

get_peptides_before_cleavage = function(sequence, cleavage_site_mapping){
  curr_start = 1
  peptides = c()
  for(i in seq_len(nrow(cleavage_site_mapping))){
    curr_stop = cleavage_site_mapping$Site[i] - 1
    peptides = c(peptides, substr(sequence, start = curr_start, stop = curr_stop))
    curr_start = curr_stop + 1
  }
  return(peptides)
}

get_peptides_after_cleavage = function(sequence, cleavage_site_mapping){
  peptides = c()
  for(i in seq_len(nrow(cleavage_site_mapping) - 1)){
    curr_start = cleavage_site_mapping$Site[i]
    curr_stop = cleavage_site_mapping$Site[i + 1] - 1
    peptides = c(peptides, substr(sequence, start = curr_start, stop = curr_stop))
  }
  curr_start = cleavage_site_mapping$Site[i + 1]
  curr_stop = nchar(sequence)
  peptides = c(peptides, substr(sequence, start = curr_start, stop = curr_stop))
  
  return(peptides)
}

get_peptide_abundances = function(sequences, sequence_abundances, cleavage_site_mapping, prop_miss = 0.1){
  cleavage_site_mapping = get_imperfect_cleavage_site_mapping(cleavage_site_mapping, prop_miss)
  prior_peptide_vec = c()
  out = vector("list", length(sequences))
  for(i in seq_along(sequences)){
    peptides_before_cleavage = get_peptides_before_cleavage(
      sequences[[i]],
      cleavage_site_mapping[[i]]
    )
    peptides_after_cleavage = get_peptides_after_cleavage(
      sequences[[i]],
      cleavage_site_mapping[[i]]
    )
    cleavage_site_mapping[[i]]$PriorPeptide = peptides_before_cleavage
    cleavage_site_mapping[[i]]$PosteriorPeptide = peptides_after_cleavage
    cleavage_site_mapping[[i]]$Protein = sequences[[i]]
    out[[i]] = data.frame(peptide = c(peptides_before_cleavage[1], peptides_after_cleavage), start_site = c(1, cleavage_site_mapping[[i]]$Site), abundance = sequence_abundances[i])
  }
  return(list(abundances = out, cleavage_site_mapping = cleavage_site_mapping))
}

sequence_abundances = c(1)

#generate subject effects
n_subjects_per_group = 500
n_groups = 2

generate_group_data = function(n, group_id, sequences, sequence_abundances, cleavage_site_mapping, prop_miss = 0.1){
  peptide_data = vector("list", n)
  cleavage_data = vector("list", n)
  subject_ids = paste0("Subject_", seq_len(n), "_", group_id)
  for(i in seq_len(n)){
    out = get_peptide_abundances(
      sequences,
      sequence_abundances,
      cleavage_site_mapping,
      prop_miss
      )
    peptide_data[[i]] = reduce(out$abundances, rbind)
    cleavage_data[[i]] = reduce(out$cleavage_site_mapping, rbind)
    names(peptide_data[[i]])[3] = subject_ids[i]
  }
  peptide_data = reduce(peptide_data, full_join, by = c("peptide", "start_site"))
  cleavage_data = reduce(cleavage_data, rbind)
  return(list(peptide_data = peptide_data, cleavage_data = cleavage_data))
}


g1_full_out = generate_group_data(
  n_subjects_per_group, 
  1,
  synthetic_proteins,
  sequence_abundances,
  g1_cleavage_site_mapping,
  prop_miss = 0.5)

g2_full_out = generate_group_data(
  n_subjects_per_group, 
  2,
  synthetic_proteins,
  sequence_abundances,
  g2_cleavage_site_mapping,
  prop_miss = 0.5)

full_e_data = full_join(
  g1_full_out$peptide_data,
  g2_full_out$peptide_data,
  by = c("peptide", "start_site"))
full_cleavage_mapping = full_join(
  g1_full_out$cleavage_data %>% select(-Protein) %>% distinct,
  g2_full_out$cleavage_data %>% select(-Protein) %>% distinct
  ) %>% mutate(
    group_1 = 0,
    group_2 = 0
  )

G1_idx = 3:(2 + n_subjects_per_group)
G2_idx = 3:(2 + n_subjects_per_group) + n_subjects_per_group
for(rr in seq_len(nrow(full_cleavage_mapping))){
  tmp_vec = full_cleavage_mapping[rr,]
  site = tmp_vec$Site
  priorpep = tmp_vec$PriorPeptide
  postpep = tmp_vec$PosteriorPeptide
  
  tmp_vec$group_1 = mean(!is.na(full_e_data[,c(1, 2, G1_idx)] %>% filter(peptide == postpep & start_site == site) %>% select(-peptide, -start_site)))
  tmp_vec$group_2 = mean(!is.na(full_e_data[,c(1, 2, G2_idx)] %>% filter(peptide == postpep & start_site == site) %>% select(-peptide, -start_site)))
  full_cleavage_mapping[rr,] = tmp_vec
}

rolled_up = full_cleavage_mapping %>% group_by(Site, Enzyme) %>% summarise(group_1 = mean(group_1), group_2 = mean(group_2))
ggdat = rolled_up %>% pivot_longer(., c("group_1", "group_2"), values_to = "prop_detected", names_to = "group")

ggdat = rolled_up %>% mutate(diff = group_1 - group_2)

masking_data = data.frame(
  Site = seq_len(nchar(g1_mask_out[[1]]$sequence)),
  group_1 = strsplit(g1_mask_out[[1]]$masked_sequence, '')[[1]],
  group_2 = strsplit(g2_mask_out[[1]]$masked_sequence, '')[[1]]) %>%
  mutate(
    cat = paste0(group_1, group_2),
    category = case_when(
      cat == "##" ~ "Both",
      grepl("#[A-Za-z]", cat) ~ "Group 1 Only", # Matches "#x" where "x" is any letter
      grepl("[A-Za-z]#", cat) ~ "Group 2 Only", # Matches "x#" where "x" is any letter
      grepl("[A-Za-z][A-Za-z]", cat) ~ "Neither" # Matches "xx" where both are any letters
    )
  )

# Create a complete sequence of Sites
all_sites <- data.frame(Site = seq(min(masking_data$Site), max(masking_data$Site)))

# Merge all_sites with ggdat and masking_data
plot_data <- all_sites %>%
  left_join(ggdat, by = "Site") %>%
  left_join(masking_data, by = "Site")

# Separate the data for the line plot (where diff is not NA)
line_data <- plot_data %>%
  filter(!is.na(diff))

# Creating the plot
ggplot() + 
  geom_tile(data = plot_data, aes(x = Site, y = (max(line_data$diff, na.rm = TRUE) + min(line_data$diff, na.rm = TRUE)) / 2, fill = category), height = max(line_data$diff, na.rm = TRUE) - min(line_data$diff, na.rm = TRUE), alpha = 0.2) + 
  geom_line(data = line_data, aes(x = Site, y = diff)) + 
  theme_bw() +
  scale_fill_manual(values = c("Neither" = "gray", "Both" = "blue", "Group 1 Only" = "orange", "Group 2 Only" = "green")) +
  labs(fill = "Region Masked?") + 
  ylab("Difference in Proportion of Samples with Cleavage")

knitr::knit_exit()